{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "plt.rcParams['font.sans-serif']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "import seaborn as sns; sns.set()\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   category                                            content\n",
       "0         2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1         2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2         1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3         2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4         2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=pd.read_csv(\"./data/training.csv\",names=['category','content'],encoding='utf-8',header=None)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词： "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第一步：使用jieba分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents=data.content.values.tolist()# 使用jieba分词器要求传入的数据格式为list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\sugar\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.744 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "content_s=[]\n",
    "for line in contents:\n",
    "    current_segment=jieba.lcut(line)\n",
    "    if len(current_segment)>1 and current_segment!=\"\\n\":\n",
    "        content_s.append(current_segment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774,)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.array(content_s).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content_s</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           content_s\n",
       "0  [合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...\n",
       "1  [公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...\n",
       "2  [公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...\n",
       "3  [公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...\n",
       "4  [该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ..."
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_content=pd.DataFrame({'content_s':content_s})\n",
    "df_content.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第二步：去掉停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stopwords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\"</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>#</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>$</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stopwords\n",
       "0         !\n",
       "1         \"\n",
       "2         #\n",
       "3         $\n",
       "4         %"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 导入停用词库\n",
    "stopwords=pd.read_csv('./data/stopwords.txt',index_col=False,sep='\\t',quoting=3,names=['stopwords'],encoding='gbk')\n",
    "stopwords.head()\n",
    "# pandas.read_csv参数整理：https://www.cnblogs.com/datablog/p/6127000.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去掉停用词\n",
    "# 自定义一个函数\n",
    "def drop_stopwords(contents,stopwords):\n",
    "    content_clean=[]\n",
    "    all_words=[]\n",
    "    for line in contents:\n",
    "        line_clean=[]\n",
    "        for word in line:\n",
    "            if word  in stopwords:\n",
    "                continue\n",
    "            line_clean.append(word)\n",
    "            all_words.append(str(word))\n",
    "        content_clean.append(line_clean)\n",
    "    return content_clean,all_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents=df_content.content_s.values.tolist()\n",
    "stopwords=stopwords.stopwords.values.tolist()\n",
    "content_clean,all_words=drop_stopwords(contents,stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>contents_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 一家, 专注, 股票, 债券, 二级, 市场, 投资, 合格, 投资者, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 主营业务, 微, 企业, 个体, 工商户, 农户, 客户, 提供, 贷款, 服务,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, 致力于, 商业地产, 开发, 销售, 运营, 全, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 工商管理, 部门, 核准, 经营范围, 投资, 咨询, 经济, 信息, 咨询, 企...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[公司, 主营业务, 中国, 境内, 港, 澳, 台, 保险代理, 销售, 依托, 产品, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      contents_clean\n",
       "0  [合晟, 资产, 一家, 专注, 股票, 债券, 二级, 市场, 投资, 合格, 投资者, ...\n",
       "1  [公司, 主营业务, 微, 企业, 个体, 工商户, 农户, 客户, 提供, 贷款, 服务,...\n",
       "2  [公司, 立足于, 商业地产, 服务, 致力于, 商业地产, 开发, 销售, 运营, 全, ...\n",
       "3  [公司, 工商管理, 部门, 核准, 经营范围, 投资, 咨询, 经济, 信息, 咨询, 企...\n",
       "4  [公司, 主营业务, 中国, 境内, 港, 澳, 台, 保险代理, 销售, 依托, 产品, ..."
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_content1=pd.DataFrame({'contents_clean':content_clean})\n",
    "df_content1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分词任务完成"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 特征提取（20分）： 去掉停用词后（stopwords.txt），采用TFIDF作为每个文本的特征描述。 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 服务 企业 公司 业务范围 包括 资产 管理 投资 咨询 投资 顾问 服务 公司 管理 私募 基金 产品 包括 股票 型 债券 型 资产 管理 计划 证券 投资 基金 管理 总资产 规模 80 亿元 中国 证券 投资 基金业 协会 数据 公司 管理 私募 证券 投资 基金 顾问 管理 类 规模 较大 公司 管理 规模 处于 50 亿元 第一 梯队'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 准备数据\n",
    "X_train=df_content1.contents_clean\n",
    "words=[]\n",
    "for line_index in range(len(X_train)):\n",
    "    try:\n",
    "        words.append(' '.join(X_train[line_index]))# 将数据格式转化成字符串的格式，因为TfidfVectorizer算法要求是这样的格式\n",
    "    except:\n",
    "        print(line_index)\n",
    "words[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入工具包,构建tfidf特征\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "Vectorizer=TfidfVectorizer(analyzer='word',max_features=20,lowercase=False)\n",
    "X=Vectorizer.fit_transform(words).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 20)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>专业</th>\n",
       "      <th>业务</th>\n",
       "      <th>主营业务</th>\n",
       "      <th>产品</th>\n",
       "      <th>企业</th>\n",
       "      <th>公司</th>\n",
       "      <th>包括</th>\n",
       "      <th>客户</th>\n",
       "      <th>技术</th>\n",
       "      <th>提供</th>\n",
       "      <th>收入</th>\n",
       "      <th>服务</th>\n",
       "      <th>生产</th>\n",
       "      <th>研发</th>\n",
       "      <th>系统</th>\n",
       "      <th>经营</th>\n",
       "      <th>行业</th>\n",
       "      <th>设备</th>\n",
       "      <th>销售</th>\n",
       "      <th>领域</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.291244</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.195930</td>\n",
       "      <td>0.222559</td>\n",
       "      <td>0.517844</td>\n",
       "      <td>0.582875</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.224889</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.410823</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.644582</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.353977</td>\n",
       "      <td>0.205906</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.426784</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.357683</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.326704</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.194661</td>\n",
       "      <td>0.510485</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.173058</td>\n",
       "      <td>0.194790</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.150311</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.686462</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.373893</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.433988</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.238328</td>\n",
       "      <td>0.554535</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.240823</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.439930</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.341682</td>\n",
       "      <td>0.281185</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.214825</td>\n",
       "      <td>0.207716</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.274497</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.238416</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.653301</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.593053</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         专业        业务      主营业务        产品        企业        公司        包括  \\\n",
       "0  0.291244  0.000000  0.000000  0.195930  0.222559  0.517844  0.582875   \n",
       "1  0.000000  0.000000  0.644582  0.000000  0.353977  0.205906  0.000000   \n",
       "2  0.194661  0.510485  0.000000  0.000000  0.000000  0.173058  0.194790   \n",
       "3  0.000000  0.000000  0.433988  0.000000  0.238328  0.554535  0.000000   \n",
       "4  0.000000  0.000000  0.214825  0.207716  0.000000  0.274497  0.000000   \n",
       "\n",
       "         客户   技术        提供   收入        服务   生产   研发   系统        经营        行业  \\\n",
       "0  0.000000  0.0  0.224889  0.0  0.410823  0.0  0.0  0.0  0.000000  0.000000   \n",
       "1  0.426784  0.0  0.357683  0.0  0.326704  0.0  0.0  0.0  0.000000  0.000000   \n",
       "2  0.000000  0.0  0.150311  0.0  0.686462  0.0  0.0  0.0  0.000000  0.000000   \n",
       "3  0.000000  0.0  0.240823  0.0  0.439930  0.0  0.0  0.0  0.341682  0.281185   \n",
       "4  0.000000  0.0  0.238416  0.0  0.653301  0.0  0.0  0.0  0.000000  0.000000   \n",
       "\n",
       "    设备        销售   领域  \n",
       "0  0.0  0.000000  0.0  \n",
       "1  0.0  0.000000  0.0  \n",
       "2  0.0  0.373893  0.0  \n",
       "3  0.0  0.000000  0.0  \n",
       "4  0.0  0.593053  0.0  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_train=pd.DataFrame(columns=Vectorizer.get_feature_names(),data=X)\n",
    "new_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "有较多的0，可见得到tfidf的结果是一个非常稀疏的矩阵。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 采用KMeans聚类算法，根据第2 步得到特征对企业进行聚类， 尝试K=5，10，15，20，30，..., 50, 并选择合适的度量指标，选择最佳的K。（60分） "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入相应工具包\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取数据\n",
    "x_train=new_train\n",
    "# y_train=data.category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.29124426, 0.        , 0.        , ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.        , 0.        , 0.64458154, ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.19466135, 0.51048482, 0.        , ..., 0.        , 0.37389322,\n",
       "        0.        ],\n",
       "       ...,\n",
       "       [0.44410459, 0.        , 0.        , ..., 0.        , 0.14216801,\n",
       "        0.        ],\n",
       "       [0.        , 0.14293407, 0.        , ..., 0.        , 0.        ,\n",
       "        0.16477106],\n",
       "       [0.21820823, 0.19074492, 0.15182084, ..., 0.        , 0.        ,\n",
       "        0.21988628]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 因为要计算样本之间的距离\n",
    "# 对每个样本数据进行归一化，每个样本的模长为1.\n",
    "normalize(x_train, norm=\"l2\", copy=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans聚类\n",
    "# 一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K, X):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)# 这里是fit_predict方法\n",
    "    \n",
    "    # K值的评估标准\n",
    "    #本案例中训练数据有标签，可采用有参考模型的评价指标\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    #亦可采用无参考默的评价指标：轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    CH_score = metrics.calinski_harabaz_score(X, y_pred)\n",
    "    \n",
    "    #轮廓系数Silhouette Coefficient在大样本时计算太慢\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score: {}\".format(si_score))\n",
    "    \n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 370.62935003991436\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 328.6136775980807\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 278.8099165153815\n",
      "K-means begin with clusters: 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 234.31830069238015\n",
      "K-means begin with clusters: 25\n",
      "CH_score: 194.3539024900104\n",
      "K-means begin with clusters: 30\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 179.11153484605376\n",
      "K-means begin with clusters: 35\n",
      "CH_score: 161.08756475459091\n",
      "K-means begin with clusters: 40\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n",
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 145.47472454450164\n",
      "K-means begin with clusters: 45\n",
      "CH_score: 135.36190484467372\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 124.78206617774109\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\miniconda_anzhuang\\lib\\site-packages\\sklearn\\utils\\deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [5,10,15,20,25,30,35,40,45,50]\n",
    "CH_scores = []\n",
    "#si_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, x_train)\n",
    "    CH_scores.append(ch)\n",
    "    #si_scores.append(si)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAD7CAYAAACPDORaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deXxV9Z3/8de5WW/IvgeSsOcbkgABAQVUXKhLq+OCiEtr7dQ6HVutv0rbmV/7ePiwM79fp1OLWqf+6rSlnRlGoQJaF+xYtdYKuIBA2PJlhyBJgJCwhUAg+f1xbyCxCAm5ybn33Pfz8eAhOffc5MPHm3dOPufc83Xa29sRERFv8rldgIiI9B2FvIiIhynkRUQ8TCEvIuJhCnkREQ+LdbuAThKAiUAtcMrlWkREIkUMUAB8BBz/9IPhFPITgb+4XYSISIS6DHjv0xvDKeRrARobj9LWFrnX7mdlJdPQcMTtMsKG+nGGetGV+tHVhfbD53PIyBgAwQz9tHAK+VMAbW3tER3yQMTXH2rqxxnqRVfqR1e97MdZx9w68Soi4mEKeRERD1PIi4h4mEJeRMTDFPIiIh6mkBcR8TBPhPyGHQf4h2eXs6v+sNuliIiEFU+EfEHWAE6eauMnz69S0IuIdOKJkM9ISeC7d40nIT5GQS8i0oknQh4gN93fJeh31inoRUQ8E/LQNegfn6+gFxHxVMjDmaBPVNCLiHgv5CEQ9N9R0IuIeDPkQUEvIgIeDnnQ6EZExNMhD5CjoBeRKOb5kAcFvYhEr6gIeVDQi0h0ipqQh85BH6ugF5GoEFUhD4Gg/95d4xT0IhIVurWQtzHmh8BtQDvwa2vtHGPMb4BLgaPB3R6z1r5ojJkOzAH8wAJr7Q/6oO5eyQ4G/Y+fW8Xj81cx+45xDM5PcbssEZGQO++RvDFmGnAVMAaYADxojDHBv19ura0M/nnRGOMH5gI3AaOAicaY6/uu/AuXrSN6EYkC5w15a+2fgSuttSeBXAJH/8eAYmCuMabKGPOYMcYHTAI2W2u3B/efB8zsu/J7R0EvIl7XrXGNtbbVGPMYMBt4AYgD3gYeAA4CrwJfBY4AtZ2eWgsU9qSgrKzknuzeazk5Kfz4wcv438+8x08XrOafvj6FEYXpvf6ccob6cYZ60ZX60VVf9KNbIQ9grX3UGPNj4BXgamvtLR2PGWOeBu4BFhKY23dwgLaeFNTQcIS2tvbz7xhCPmD2rEp+/Nwqvv/MUmbfWcmQ/NQL+lw5OSns26ffCDqoH2eoF12pH11daD98PuecB8fdmcmXGmMqAay1zcBiYJYxZkan3RygFdgNFHTang/s6XHVLugY3fgTYnn8+dXsqDvkdkkiIr3WnUsohwG/NMYkGGPiCZxU/TPwpDEmwxgTB9wPvAh8ABhjzAhjTAxwF/B6H9Uech1Bn5SooBcRb+jOidclwGvAKmAlsMxa+0PgR8BSYAOw2lr7vLW2BbgXWBTcXk1ghBMxstP9fPdOBb2IeIPT3t6/8+9zGAJsd2Mmfzb7m47xr8+vornlJI/cUcnQgu7N6DVn7Er9OEO96Er96CoEM/mhwI6/erzXlXlUdrqf7wZHNz+dv5rttTqiF5HIo5A/h+w0Bb2IRDaF/Hko6EUkkinku0FBLyKRSiHfTZ2D/nEFvYhECIV8D3QE/QAFvYhECIV8DynoRSSSKOQvgIJeRCKFQv4CZaf5+d5d4xX0IhLWFPK9kJWWqKAXkbCmkO+lTwf9pl2NbpckInKaQj4EOgf9o/++nPoDzW6XJCICKORDJistkdl3jsNxHJ5cWMWRY61ulyQiopAPpdx0P9//yiQaDh7jmRfXcvJUjxbFEhEJOYV8iJUPy+LL15VSvauJeW9YwuhWziIShbq9xqt039TRBdQ3NvPqsp0UZA3g2knFbpckIlFKId9Hbr5sGHUNzfzu7S3kZSRROTLb7ZJE
JAppXNNHfI7DV28oY3B+Cs++vJ5d9VoBR0T6n0K+DyXExfDQbWNISozlqYVVNB057nZJIhJlFPJ9LD05gW/dNobmlpM8vaiK462n3C5JRKKIQr4fFOelcP/flLGj9jC/fnUDbbriRkT6iUK+n4wbmcPMK0ewwu7jpb9sd7scEYkSurqmH107qYi6A0d5ddkO8jP9TKkocLskEfE4Hcn3I8dx+OI1htLidH77ejWbaprcLklEPE4h389iY3w8cMtostL8/NvitextOuZ2SSLiYQp5FyT743j4tjG0t7fz1AtraG456XZJIuJRCnmX5GUm8Y1bRrO38Rj/7/frONWmm5mJSOgp5F1UOjiDL11rWL/9AM+9uVk3MxORkNPVNS67fOxA6g4084cPdlGQmcT0CUVulyQiHqKQDwO3TRtO/YFmnn9rM7kZSYwZnuV2SSLiERrXhAGfz+H+G8spyk3mF79fx+59R9wuSUQ8olshb4z5oTFmgzFmvTHm28Ft040xVcaYzcaYf+60b6UxZoUxZpMx5lfGGP220A0J8TE8NGMMCfExPPVCFYeOnnC7JBHxgPOGvDFmGnAVMAaYADxojBkLzAVuAkYBE40x1wefMg/4prW2BHCAr/VF4V6UmZrIQzPGcLj5BE8vrqL1pG5mJiK9c96Qt9b+GbjSWnsSyCUwx08HNltrtwe3zwNmGmMGA35r7fvBp/8WmNknlXvU0IJUvnZjGVs/OcTcJdW64kZEeqVb4xprbasx5jFgA/AWMBCo7bRLLVB4ju3SAxeZXGZMG8YHG+p5ZekOt8sRkQjW7Xm5tfZRY8yPgVeAEqDzIaYDtBH4oXG27d2WlZXck93DUk5OSq8/x5dvrKDxaCsvvbedkUMyuXxc5P6sDEU/vEK96Er96Kov+nHekDfGlAKJ1trV1tpmY8xi4Dag88A4H9gD7AYKzrK92xoajtDWFrkjipycFPbtC81Sf7OuGM7uukM88fwq4h0YPigtJJ+3P4WyH5FOvehK/ejqQvvh8znnPDjuzrhmGPBLY0yCMSaewMnWZwFjjBlhjIkB7gJet9buBFqMMVODz/0S8HqPqxYA4mJ9fOPW0WSkxPP0oir2H9TNzESkZ7pz4nUJ8BqwClgJLLPWzgfuBRYRmNNXAwuDT7kbeMIYUw0kAz8LfdnRIyUpnodnjqX1VDtPLazi2HHdzExEus8Jo6s3hgDbNa45u/U7DvDEgjVUDMvkoRlj8PmckH+NvqBfyc9QL7pSP7oKwbhmKLDjrx7vdWXSL8qHZHL3NSVUbW1gwdtb3C5HRCKE3o0aQa4cN4i6hmb+uKKG/Kwkrhw3yO2SRCTMKeQjzKyrRlDf2Mx/v7GJ3HQ/5UMz3S5JRMKYxjURxudz+Lu/KWdgdhLPvLSOPfuPul2SiIQxhXwE8ifE8tBtY4iL9fHUwjUcbtbNzETk7BTyESo7zc+DM0bTePgEP1+8ltaTWj5QRP6aQj6CDR+Yxn03jGLT7oP85x90MzMR+Ws68RrhJo3Ko66hmZfe205+VhJfmDzE7ZJEJIwo5D3gxqlDqDvQzKI/byMvI4kJpblulyQiYULjGg9wHIevfL6U4YNS+dWrG9hee8jtkkQkTCjkPSIuNoYHbx1D6oB4fraoigOHWtwuSUTCgELeQ1IHxPPQbWM4fuIUT7ywhiPHWt0uSURcppD3mMKcZB68dTT1B47xxO9W666VIlFOIe9Bo4Zk8sDNFeysO8JTC6s43qoFwUWilULeoypHZnPfjaPYXNPEMy+u4+QpvVlKJBop5D3skrJ87rnOsHZbA//+8npOtSnoRaKNQt7jplUOYtZVI1hh9/Hb16tp07tiRaKK3gwVBa6dVMyx4yd5eekO/PGx3Dl9JI4TGStLiUjvKOSjxE2XDqXlxCne+KiGxIRYbr18mNsliUg/UMhHCcdxmHXVCFpOnOTVZTvwx8dw/SWD3S5LRPqYQj6KOI7DPdeW0nLiFC+8s5XEhFgt
ISjicQr5KOPzOdx3QxnHT5xi3v9YEuNimFyR73ZZItJHdHVNFIqN8fH3N1dgitP59Wsb+XjTPrdLEpE+opCPUvFxMTw4YwxDClL4xe/XsX77AbdLEpE+oJCPYv6EWB6eOZb8zAE8vbiKzbub3C5JREJMIR/lkv1xPHJHJRnJCTz5whp21h12uyQRCSGFvJA2IJ7Zd4wjKSGWny5YzZ79R90uSURCRCEvAGSlJTL7jnH4fA6Pz1/FvqZjbpckIiGgkJfT8jKTmD2rktaTbfzk+VU0Hj7udkki0ksKeemiMDeZ/3V7JYePtfL4/FUcbj7hdkki0gsKefkrwwam8q0ZY9h/sIU5C9bQ3KLVpUQiVbfe8WqMeRS4Pfjha9ba7xpjfgNcCnScpXvMWvuiMWY6MAfwAwustT8IddHS90oHZ/CNWyp4etFanly4hkduryQhPsbtskSkh857JB8M7WuAcUAlcJEx5hZgAnC5tbYy+OdFY4wfmAvcBIwCJhpjru+78qUvjRmezf1/U87WTw7yby+upfWkFh0RiTTdGdfUAo9Ya09Ya1uBjUBx8M9cY0yVMeYxY4wPmARsttZut9aeBOYBM/uqeOl7E0tzufe6UtZvP8CzWl1KJOKcd1xjrV3f8XdjzEgCY5vLgCuAB4CDwKvAV4EjBH4odKgFCkNXrrjhsrEDaTlxiuff2szc16r56g2j8GnREZGI0O27UBpjyoHXgO9Yay1wS6fHngbuARYCndeXc4AeHfplZSX3ZPewlJOT4nYJIXfX58uIiYth3h+qyUhL5Ou3jun26lJe7MeFUi+6Uj+66ot+dPfE61RgEfCwtXa+MWY0UGKtXRTcxQFagd1AQaen5gN7elJQQ8MR2toidx3SnJwU9u3z5q0BrhxbwP7GZpYs2wFt7dx2xfDzPsfL/egp9aIr9aOrC+2Hz+ec8+D4vCFvjCkCXgJmWWvfDm52gCeNMW8TGNHcD/wH8EHgKWYEsB24i8CJWPEAx3GYecVwWk6cYsn7O/EnxPCFyUPcLktEzqE7R/KzgURgjjGmY9svgB8BS4E4YJG19nkAY8y9BI76E4ElBEY44hGO4/DFa0poOXGSRX/eRmJ8LFdfpNMuIuGqOydevwV86zMefuYs+78FjO1lXRLGfI7D335+FMdPnOK//7iJxPgYpo4uOP8TRaTf6R2vckFiY3x8/aZyyoZkMHfJRlZU73W7JBE5C4W8XLC42BgevHUMwwem8ezL61m7rcHtkkTkUxTy0isJ8TE8PHMMg7IH8PPFa7G7Gt0uSUQ6UchLryUlxvHtOyrJSkvkqYVVbK895HZJIhKkkJeQSE2K55FZlST745izYDW79x1xuyQRQSEvIZSZmsjsOyqJjfXx0/mrqW9sdrskkainkJeQys0IrC51qq2dx59fzb5GLSMo4iaFvITcoJxkvj1rLM3HW/mHZ95jr9aLFXGNQl76xJD8VGbfMY5jLa38y7yV7Nl/9PxPEpGQU8hLnxlakMqPHriUtnb4l//+mJ11uhmVSH9TyEufGlyQyj/ePZ74OB//+vwqtnxy0O2SRKKKQl76XF5mEv9490WkJMXx0/mr2bjjgNsliUQNhbz0i6y0RP7h7vFkpyfyxAtVrN6y3+2SRKKCQl76TXpyAt+7azyFOYFbIHy4sd7tkkQ8TyEv/SrZH8d37hzHsIGpPPvyev5S1aOFw0SkhxTy0u/8CbF8+/ZKyoZk8psl1by5osbtkkQ8SyEvrkiIj+GhGWMYNzKb597czKvLdrhdkognKeTFNXGxPv7+5gouKc9j8bvbWPjOVtrbI3cRd5Fw1J01XkX6TGyMj/tuKCMhLoYl7+/k+IlT3Pm5kfgcx+3SRDxBIS+u8zkO91xrSIyP4X8+rKGl9SRfuX4UPp+CXqS3FPISFhzH4fYrR5AYH8vv39vO8dY27r+xjNgYTRRFekMhL2HDcRxuunQoCXEx/O5PWzjReooHbq4gPi7G7dJEIpYO
kyTsXHdxMfdca1i7tYEnX1jDseMn3S5JJGIp5CUsXTFuEPfdUMammoPMWbCaoy2tbpckEpEU8hK2Jlfk8/c3V7Cz/jA/eW4Vh46ecLskkYijkJewdpHJ4aEZY6g70MyPn/uYxsPH3S5JJKIo5CXsVQzL4tuzKmk8fJwfzVup5QRFekAhLxGhpCid79w5jmPHT2o5QZEeUMhLxBhakMr37h6v5QRFekAhLxGlMCdZywmK9IBCXiKOlhMU6b5uvePVGPMocHvww9estd81xkwH5gB+YIG19gfBfSuBXwGpwLvA1621ejeLhFTHcoI/nb+aJ16o4hu3VDB2RLbbZYmEnfMeyQfD/BpgHFAJXGSMuROYC9wEjAImGmOuDz5lHvBNa20J4ABf64vCRdKTE/je3YHlBP9NywmKnFV3xjW1wCPW2hPW2lZgI1ACbLbWbg8epc8DZhpjBgN+a+37wef+FpjZB3WLAFpOUOR8zhvy1tr1HaFtjBlJYGzTRiD8O9QChcDAz9gu0me0nKDIZ+v2XSiNMeXAa8B3gJMEjuY7OASC3we0n2V7t2VlJfdk97CUk5Pidglhpb/68U9fn8K//tcKnntzM7Hxscy8uuT8T+pnem10pX501Rf96O6J16nAIuBha+18Y8w0oKDTLvnAHmD3Z2zvtoaGI7S1Re4ScDk5Kezbp+u3O/R3P/72+lJob+c/l2ykobGZWy8fhhMmq0zptdGV+tHVhfbD53POeXDcnROvRcBLwF3W2vnBzR8EHjIjjDExwF3A69banUBL8IcCwJeA13tctcgF6lhOcFrlQF5bvpPn3txMm9aNlSjWnSP52UAiMMcY07HtF8C9BI7uE4ElwMLgY3cDvzTGpAIfAz8LYb0i5/Xp5QSPnzjF3deUkKDFRyQKnTfkrbXfAr71GQ+PPcv+a4BJvaxLpFc+vZzgR3YvE0pymFyRT2lxhtaPlaih5f/EszqWEywtTmfp2jpW2L0sXVdHRkoCl5TlMbkin8KcyD/RL3IuCnnxPFOcgSnO4IvXlLB6y36Wravjfz6s4fUPdlGUm8zk8nwuKc8jPTnB7VJFQk4hL1EjPi6GSaPymDQqj0NHT/DhxnqWr6/jd3/awgvvbKFsSCaTy/MYX5JDYry+NcQb9EqWqJQ6IJ7pE4qYPqGI2oajLF9fx/J19fzq1Y0kxG1ifEk2kyvyKRucqfm9RDSFvES9gqwB3Hr5cG6+bBhbdh9k2bo6Pqrey/L19aQlx3PxqDymVORTlJscNtfci3SXQl4kyOc4lBSlU1KUzt2fG8maLQ0sX1/HWyt388ZHNQzKGcCU8nwuLssjMzXR7XJFukUhL3IWcbExTCjNZUJpLkeOtZ6e37/wzlYWvrOV0sEZTC7P5yKTgz9B30YSvvTqFDmPZH8cV40v5KrxhdQ3NrN8XR3vr69n7pKNzHvDMq4kh8nleZQPzSTGp3V4JLwo5EV6IC8jiZsvG8ZNlw5l655DLF9Xx4cb6/lgQz2pSXFMKgvM7wfnpWh+L2FBIS9yARzHYcSgNEYMSuPO6SOp2hqY37+z6hPeXLGbgqyk09ffZ6f53S5XophCXqSXYmN8jC/JYXxJDkdbWgNX5qyrY/G721j87jZMUTqTK/K5buowt0uVKKSQFwmhAYlxXFE5iCsqB7Gv6Rjvr69j2fp6fvt6Nc/9cROXlOdx9UVFFOXqdgrSPxTyIn0kJ93PjVOHcsOUIeyoO8yHdh9/WlHDu2tqMUXpTJ9QSOXIbJ2slT6lkBfpY47jMLQglUljBvGFi4t5r6qWt1bu5ucvriMrNYErxxdy+diBJPvj3C5VPEghL9KPkv1xXHdxMddMLGLNlv28uXI3C9/Zyu/f285kjXKkDyjkRVzg8zmMK8lhXEkOu/cd4a2Vu1m+rk6jHAk5hbyIywpzkvnydaXMmDa8yygnMzWBqzTKkV5SyIuECY1ypC8o5EXCjEY5EkoKeZEwplGO9JZCXiQCnGuUc0lZ
HldfVEhxXorbZUoYUsiLRJDPGuX8pSowyrn6okLGlWiUI2co5EUi1NlGOc+8pFGOdKWQF4lwGuXIuSjkRTxCoxw5G4W8iAeda5RzReUgpo4uICMlwe0ypR8o5EU87NOjnD+uqGHxu9t48d1tlA3JYMroAsaX5JAQF+N2qdJHFPIiUaDzKKe+sZlla+tYtq6OX76ygcT4wKLlUyvyKSlK17KFHqOQF4kyeRlJ3HL5MG66bCibdjWxdF0tH1Xv5b2qWrLTEplSkc+U0QXkpmvZQi9QyItEKZ/jUDo4g9LBGXzxc6dYuWkvS9fW8crSHby8dAclhWlMGV3AxNJc/AmKikil/3MiQkJ8DFMqCphSUUDDwRaWr69j6bq608sWji/JYcrofMoGZ+LzaZwTSbod8saYVGAZcIO1docx5jfApcDR4C6PWWtfNMZMB+YAfmCBtfYHoS5aRPpOVloiN0wZwhcmD2bbnkMsXVfHhxvqeX9DPRkpCUwuz2fq6HwKsga4Xap0Q7dC3hhzMfBLoKTT5gnA5dba2k77+YG5wDSgBnjNGHO9tfb10JUsIv3BcRyGD0pj+KA07rx6BKu3NLB0bS1/+GAXS97fydCCVKaOzmfSqDy9szaMdfdI/mvAN4D/AjDGJAHFwFxjzCDgReAxYBKw2Vq7PbjfPGAmoJAXiWBxsTFMLM1lYmkuB48c5/0N9SxdW8e8NzYx/63NjB2RzdSKAiqGZRIbozdbhZNuhby19j4AY0zHpnzgbeAB4CDwKvBV4AhQ2+mptUBhiGoVkTCQlpzAtZOKuXZSMbvqD7N0bR3vb6hjpd1HSlIcl5QFxjm6lUJ4uKATr9babcAtHR8bY54G7gEWAu2ddnWAtp587qysyF/5JidHL+7O1I8zvNaLnJwULqoYyAOn2vi4ei9vrdjFn1Z9wh9X1DCkIJWrJxYxbXwhGSmJn/l8OaMv+nFBIW+MGQ2UWGsXBTc5QCuwGyjotGs+sKcnn7uh4Qhtbe3n3zFM5eSksG/fYbfLCBvqxxle78XQ3AHc9/lR3HHlCD7cGBjn/Prl9fzmlQ1UDMtk6ugCKkdkERcbeHet1/vRUxfaD5/POefB8YVeQukATxpj3iYworkf+A/gA8AYY0YA24G7CJyIFZEokeyP46rxhVw1vpA9+4+ydF0ty9fVUbW1gaSEWCaV5TG1Ip/s7Mj/rT0SXOi4psoY8yNgKRAHLLLWPg9gjLkXWAQkAksIjHBEJAoNzB7AzCtGMOPy4WzYeSBwO4W1tbyz6hOyX9nAiEGplBSlU1KYTkFWkm6p0Aec9vawGY0MAbZrXOMt6scZ6kXAseMnWVG9l017DrF2y34OHT0BQEpSHCML0wOhX5RGUW5yVN0WOQTjmqHAjk8/rne8iki/8ifEctnYgdw63bB37yH2Nh7D1jSxuaYJW9PEx5v2AZAYH8OIQWnB0E9naEHK6Xm+dJ9CXkRc4zgOeZlJ5GUmcfnYgQAcONTCpt1NbK45yKaaJha/uw2A2BgfwwpSKCkOhP7wgWm6p043qEMiElYyUxO5pCyfS8ryAThyrPX0Uf7m3U0sWb6LV5ftxOc4FOclnz7SH1mYRkpSvMvVhx+FvIiEtWR/3Ol74QO0nDjJ1k8OnR7xvP3xJ7zxUQ0QONFbUnhmxJOZevbr86OJQl5EIkpifCzlQzMpH5oJQOvJNnbUHWJT8Gj//Q31vLM68Pac7LTE04FfUpROXoY/6q7gUciLSESLi/UxsjCdkYXpfGEytLW1U7P3CJtqmthU08TabQ0sW1cHQOqAeEoK0xhZlI4pSqcwJ9nzt05WyIuIp/h8DoPzUxicn8LnJhbR3t5O3YHm06G/qeYgK2zgCh5/QiwjC9MoLc6gdHA6xbkpngt9hbyIeJrjOBRkDaAgawDTKgcB0HAwcAWP3RUY8VRtbQACoW+K0jHF6ZQWZ1CUG/lH+gp5EYk6WWmJTE7LZ3J54AqexsPHsbsaqd7VhN3VyOot
+wFISojFFKdjijMoLU6nMDcZX4TN9BXyIhL1MlISuKQ8n0uCoX/gUAu2ponqnY3YXU2s2hwI/QGJsZQUpQfWxi3OYFDOgLAPfYW8iMinZKYmMrn8zJH+gUMtVO9qpHpnE9W7Gk+HfrI/7sx4Z3AGA7PDL/QV8iIi55GZmnh6oXOA/QePYXc1nQ7+lcFbMST7407P80uL0xmYPcD1SzYV8iIiPZSd5id7tJ+po4Oh33SM6o7Q39XIyuDVOylJcafn+aXFGa7caVMhLyLSS9npfi5N93PpmALa29vZd7AFu7PxdPCvqN4LBK7TL+10Ijc/s+9DXyEvIhJCjuOQm+4nN93PZWMHBkK/85H+zkY+3BgI/bQB8ZjidMqGZHLzlSP7pB6FvIhIH3Ich9yMJHIzAnfabG9vZ2/jseBop+l06A8tyqAo0x/yr6+QFxHpR51vrzytchDt7e0cOdbKsMFZfbKoTPQsuyIiEoYcx+nTWyQr5EVEPEwhLyLiYQp5EREPU8iLiHiYQl5ExMMU8iIiHhZO18nHABF/g37wxr8hlNSPM9SLrtSPri6kH52eE3O2x5329vZelBRSlwJ/cbsIEZEIdRnw3qc3hlPIJwATgVrglMu1iIhEihigAPgIOP7pB8Mp5EVEJMR04lVExMMU8iIiHqaQFxHxMIW8iIiHKeRFRDxMIS8i4mEKeRERDwun2xpEJGNMKrAMuMFau8MYMx2YA/iBBdbaH7haYD8yxjwK3B788DVr7XejvB8/BG4D2oFfW2vnRHM/AIwxjwPZ1tp7jTGVwK+AVOBd4OvW2pOuFthPjDF/AnKB1uCmvwOGAz8A4oAnrbU/D8XX0pF8LxhjLibwNuKS4Md+YC5wEzAKmGiMud69CvtPMLyuAcYBlcBFxpg7id5+TAOuAsYAE4AHjTFjidJ+ABhjrga+3GnTPOCb1toSwAG+5kph/cwY4xDIjLHW2kprbSWwG/g/BG7vUgncb4wpC8XXU8j3zteAbwB7gh9PAjZba7cHj0jmATPdKq6f1QKPWGtPWGtbgY0EXshR2Q9r7Z+BK4P/7lwCvzWnE6X9MMZkEgix/xv8eDDgt9a+H9zlt0RJLwAT/O8bxpg1xphvAtOBt621B6y1R4GFBIhiyb0AAAHwSURBVH4L7DWNa3rBWnsfgDEd/88YSCDsOtQChf1cliustes7/m6MGUlgbPM0UdoPAGttqzHmMWA28AJR/PoAngW+DxQFP47mXmQAbwEPEhjNvAMs4K/7MSkUX0xH8qHlIzB/7eAAbS7V4gpjTDnwR+A7wDaivB/W2keBHALhVkIU9sMYcx9QY619q9PmqP1esdYut9beY609aK3dD/wa+CF91A8dyYfWbgJ3g+uQz5lRjucZY6YCi4CHrbXzg3PpqOyHMaYUSLTWrrbWNhtjFhP49bvzHVajpR+zgAJjzGogE0gmEGjR+tq4FEjo9EPPAXbQR/1QyIfWB4AxxowAtgN3ETjR5nnGmCLgJWCWtfbt4Oao7QcwDHgs+A3dTuBk67PAT6KtH9baz3X83RhzL3CFtfYrxph1xpip1tqlwJeA192qsZ+lAz80xkwhMK75MvBFYJ4xJgc4CswA7g/FF9O4JoSstS3AvQSOZjcA1QROoESD2UAiMMcYszp41HYvUdoPa+0S4DVgFbASWGatnU+U9uMz3A08YYypJnB0/zOX6+kX1tpX6framBv8Qfd94E/AauA5a+2Hofh6up+8iIiH6UheRMTDFPIiIh6mkBcR8TCFvIiIhynkRUQ8TCEvIuJhCnkREQ9TyIuIeNj/B3ckRHmrWDqZAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同K对应的聚类的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-',label = 'CH_scores')\n",
    "\n",
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[ index[0]]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据实际的类别数为10，但这里计算得出的最佳聚类数为5，差距较大，说明K-means聚类算法在该数据集上的效果有限。样本量不到5000，可能是样本偏少所致，也可能与使用TF-IDF提取特征有关。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
